In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
PLOTLY LIBRARIES¶
In [2]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.subplots as make_subplots
In [3]:
df = pd.read_csv('heart_disease_uci.csv')
In [4]:
df.head()
Out[4]:
| id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
| 1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
| 2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
| 3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
| 4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
In [5]:
df.describe()
Out[5]:
| id | age | trestbps | chol | thalch | oldpeak | ca | num | |
|---|---|---|---|---|---|---|---|---|
| count | 920.000000 | 920.000000 | 861.000000 | 890.000000 | 865.000000 | 858.000000 | 309.000000 | 920.000000 |
| mean | 460.500000 | 53.510870 | 132.132404 | 199.130337 | 137.545665 | 0.878788 | 0.676375 | 0.995652 |
| std | 265.725422 | 9.424685 | 19.066070 | 110.780810 | 25.926276 | 1.091226 | 0.935653 | 1.142693 |
| min | 1.000000 | 28.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 | 0.000000 |
| 25% | 230.750000 | 47.000000 | 120.000000 | 175.000000 | 120.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 460.500000 | 54.000000 | 130.000000 | 223.000000 | 140.000000 | 0.500000 | 0.000000 | 1.000000 |
| 75% | 690.250000 | 60.000000 | 140.000000 | 268.000000 | 157.000000 | 1.500000 | 1.000000 | 2.000000 |
| max | 920.000000 | 77.000000 | 200.000000 | 603.000000 | 202.000000 | 6.200000 | 3.000000 | 4.000000 |
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 920 entries, 0 to 919 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 920 non-null int64 1 age 920 non-null int64 2 sex 920 non-null object 3 dataset 920 non-null object 4 cp 920 non-null object 5 trestbps 861 non-null float64 6 chol 890 non-null float64 7 fbs 830 non-null object 8 restecg 918 non-null object 9 thalch 865 non-null float64 10 exang 865 non-null object 11 oldpeak 858 non-null float64 12 slope 611 non-null object 13 ca 309 non-null float64 14 thal 434 non-null object 15 num 920 non-null int64 dtypes: float64(5), int64(3), object(8) memory usage: 115.1+ KB
In [7]:
df.isna().sum()
Out[7]:
id 0 age 0 sex 0 dataset 0 cp 0 trestbps 59 chol 30 fbs 90 restecg 2 thalch 55 exang 55 oldpeak 62 slope 309 ca 611 thal 486 num 0 dtype: int64
In [8]:
df.dropna(inplace = True)
In [9]:
df.isna().sum()
Out[9]:
id 0 age 0 sex 0 dataset 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalch 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 num 0 dtype: int64
In [10]:
df.head(2)
Out[10]:
| id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
| 1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
1️. Age vs Cholesterol for Top 4 Chest Pain Types¶
In [13]:
top_leagues = df['cp'].value_counts().nlargest().index
display(top_leagues)
plt.figure(figsize=(13,6))
ax = sns.scatterplot(x='age',y='chol',data = df[df['cp'].isin(top_leagues)],hue='cp')
plt.xlabel("Age")
plt.ylabel("Cholestrol")
plt.title("Age vs Cholestrol for Top 4 Chest pain")
plt.legend(title = "Chest Pain Type",bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_facecolor("#ffffcc")
plt.show()
Index(['asymptomatic', 'non-anginal', 'atypical angina', 'typical angina'], dtype='object', name='cp')
2. Cholesterol vs Age (Colored by Chest Pain Type)¶
In [12]:
fig = px.scatter(data_frame = df,
x="age",
y="chol",
color="cp",
size='ca',
hover_data=['oldpeak'])
fig.update_layout(title_text="<b> Cholesterol Vs Age <b>",
titlefont={'size': 24, 'family':'Serif'},
width=1000,
height=500,
)
fig.show()
3. Scatter Plot of Cholesterol vs Age (Colored by Sex)¶
In [16]:
import plotly.express as px
fig = px.scatter(df , x='chol' , y='age' , color='sex')
fig.update_layout(width = 1000 , height = 500)
fig.update_layout(title_text = 'Scatter plot of Cholestrol vs Age (colored by Sex)')
fig.show()
4. Scatter Plot of Cholesterol vs Age (Faceted by Chest Pain Type, with Size for Oldpeak)¶
In [17]:
import plotly.express as px
fig = px.scatter(
df,
x='chol',
y='age',
color='cp',
size='oldpeak',
size_max=30,
hover_name='exang',
facet_col='cp'
)
fig.update_layout(width=1000, height=500)
fig.update_layout(title_text='Scatter Plot of Cholesterol vs. Age (colored by cp)')
fig.show()
5️. Chest Pain Type vs Slope (Bar Chart)¶
In [18]:
def generate_rating_df(df):
rating_df = df.groupby(['cp', 'slope']).agg({'id': 'count'}).reset_index()
rating_df = rating_df[rating_df['id'] != 0]
rating_df.columns = ['cp', 'slope', 'counts']
rating_df = rating_df.sort_values('slope')
return rating_df
rating_df = generate_rating_df(df)
fig = px.bar(rating_df, x='cp', y='counts', color='slope')
fig.update_traces(textposition='auto',
textfont_size = 20)
fig.update_layout(width=500 , height=500)
fig.update_layout(barmode='stack')
6️. Chest Pain vs Gender¶
In [19]:
fig = px.scatter(data_frame = df ,
x='age',
y='chol',
size='ca',
size_max=30,
color='sex',
trendline='ols',
trendline_scope='overall',
trendline_color_override='black')
fig.update_layout(title_text="<b>Chest Pain vs Gender<b>",
titlefont={'size':24 , 'family':'Serif'},
width=1000,
height=500,
)
fig.show()
7️. Distribution of Age¶
In [20]:
fig = px.histogram(df,x='age',height=500 , width = 900 , template = 'simple_white',
color = 'sex',
color_discrete_sequence = ['salmon','lightblue'])
fig.update_layout(title={'text':'Histogram of Persons by Age','font':{'size':25}}
,title_font_family="Times New Roman",
title_font_color="darkgrey",
title_x=0.2)
fig.update_layout(
font_family='classic-roman',
font_color= 'grey',
yaxis_title={'text': " count", 'font': {'size':18}},
xaxis_title={'text': " Age", 'font': {'size':18}}
)
fig.show()
8️. Distribution of Cholesterol Levels by Chest Pain Type¶
In [21]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
asymptomatic = df[df['cp'] == 'asymptomatic']
non_anginal = df[df['cp'] == 'non-anginal']
atypical_angina = df[df['cp'] == 'atypical angina']
typical_angina = df[df['cp'] == 'typical angina']
fig = make_subplots(rows=2, cols=2,
specs=[[{'type': 'domain'}, {'type': 'domain'}],
[{'type': 'domain'}, {'type': 'domain'}]],
subplot_titles=("Asymptomatic", "Non-Anginal",
"Atypical Angina", "Typical Angina"))
fig.add_trace(go.Pie(labels=asymptomatic["thal"], values=asymptomatic["chol"], name="asymptomatic"), 1, 1)
fig.add_trace(go.Pie(labels=non_anginal["thal"], values=non_anginal["chol"], name="non_anginal"), 1, 2)
fig.add_trace(go.Pie(labels=atypical_angina["thal"], values=atypical_angina["chol"], name="atypical_angina"), 2, 1) # Fixed
fig.add_trace(go.Pie(labels=typical_angina["thal"], values=typical_angina["chol"], name="typical_angina"), 2, 2)
fig.update_layout(
height=800,
width=1000,
title_text="Distribution of Cholesterol Levels by Chest Pain Type",
title_font_size=24
)
fig.update_traces(textposition='inside', textfont_size=16)
fig.update_annotations(font_size=20)
fig.show()
9. Cholesterol Variation with Age¶
In [22]:
fig = px.bar(df , x='age' , y='chol' , hover_data = ['oldpeak'] , color='sex' , height=400)
fig.show()
10. Correlation HeatMap¶
In [23]:
import plotly.figure_factory as ff
def format_title(title, subtitle=None, subtitle_font="Arial", subtitle_font_size=12):
title = f'<b>{title}</b>'
if not subtitle:
return title
subtitle = f'<span style="font-family: {subtitle_font}; font-size: {subtitle_font_size}px;">{subtitle}</span>'
return f'{title}<br>{subtitle}'
_ = df.groupby(['cp', 'thal']).chol.size().unstack().fillna(0) # Fill NaN values with 0
z = _.values.tolist()
x = _.columns.tolist()
y = _.index.tolist()
fig = ff.create_annotated_heatmap(
z=z,
x=x,
y=y,
xgap=3,
ygap=3,
colorscale=[[0, '#53354A'], [1, '#E84545']], # Defining scale explicitly
)
title = format_title('CP vs Thal', 'Chol Levels', 'Arial', 12)
fig.update_layout(
width = 800,
height = 400,
title_text=title,
title_x=0.5,
titlefont={'size': 24, 'family': 'Proxima Nova'},
template='plotly_dark',
paper_bgcolor='#2B2E4A',
plot_bgcolor='#2B2E4A',
xaxis={'side': 'bottom'},
xaxis_showgrid=False,
yaxis_showgrid=False,
yaxis_autorange='reversed'
)
fig.show()
11. Age Histogram¶
In [24]:
plt.figure(figsize = (8,5))
sns.kdeplot(df.age, shade = True, color = "r")
plt.title("Age Histogram", fontsize = 20)
plt.show()
print("Histogram's skewness is {} and kurtosis is {}".format(df.age.skew(), df.age.kurtosis()))
C:\Users\a3388\AppData\Local\Temp\ipykernel_8408\942704215.py:2: FutureWarning: `shade` is now deprecated in favor of `fill`; setting `fill=True`. This will become an error in seaborn v0.14.0; please update your code.
Histogram's skewness is -0.21485314045391055 and kurtosis is -0.5174882052116159
12. Chest Pain According to Gender¶
In [25]:
template = ['ggplot2','plotly_dark', 'seaborn', 'simple_white', 'plotly']
fig = px.histogram(df,
x="cp",
y=None,
color="sex",
width=1200,
height=450,
histnorm='percent',
color_discrete_map={
"male": "RebeccaPurple", "female": "lightsalmon"
},
template="plotly_dark"
)
fig.update_layout(title="Gender Chest Pain",
font_family="San Serif",
bargap=0.2,
barmode='group',
titlefont={'size': 24},
legend=dict(
orientation="v", y=1, yanchor="top", x=1.25, xanchor="right")
)
fig.show()
13. Exploring Relationships Between Age, Cholesterol, and Heart Rate by Chest Pain Type¶
In [26]:
sns.pairplot(df[['cp','age','chol','thalch']], hue='cp', aspect=1.5,dropna=True,palette='bright')
plt.show()
14. Cholesterol Trends by Age, Sex, and Chest Pain Type¶
In [27]:
import seaborn as sns
import matplotlib.pyplot as plt
heart_df_fg = sns.FacetGrid(
data=df,
col="sex",
hue="sex",
row="cp",
height=4,
aspect=1.3,
palette='Dark2',
col_order=["Male", "Female"]
)
heart_df_fg.map_dataframe(sns.regplot, "age", "chol")
# Use dark background
plt.style.use('default')
plt.show()
15. Sex-Wise Distribution of Key Heart Disease Indicators¶
In [28]:
plt.figure(figsize=(20,8))
for i,col in enumerate(['age','chol','oldpeak'],1):
plt.subplot(1,3,i)
ax = sns.barplot(x='sex' , y=col , data = df,color='gold')
plt.title(f'{col} Comparison')
plt.ylabel(col if i==1 else '')
# Adding count values above each bar
for i in range(len(ax.containers)):
ax.bar_label(ax.containers[i] , label_type = 'edge')
plt.show()